import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
# Directory where every figure produced by this analysis is saved.
folder_path = "./plots"
# exist_ok=True makes this a no-op when the directory already exists and
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(folder_path, exist_ok=True)
# Load the housing data set from the working directory and display it.
df=pd.read_csv("housing.csv")
df
| longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -122.23 | 37.88 | 41.0 | 880.0 | 129.0 | 322.0 | 126.0 | 8.3252 | 452600.0 | NEAR BAY |
| 1 | -122.22 | 37.86 | 21.0 | 7099.0 | 1106.0 | 2401.0 | 1138.0 | 8.3014 | 358500.0 | NEAR BAY |
| 2 | -122.24 | 37.85 | 52.0 | 1467.0 | 190.0 | 496.0 | 177.0 | 7.2574 | 352100.0 | NEAR BAY |
| 3 | -122.25 | 37.85 | 52.0 | 1274.0 | 235.0 | 558.0 | 219.0 | 5.6431 | 341300.0 | NEAR BAY |
| 4 | -122.25 | 37.85 | 52.0 | 1627.0 | 280.0 | 565.0 | 259.0 | 3.8462 | 342200.0 | NEAR BAY |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20635 | -121.09 | 39.48 | 25.0 | 1665.0 | 374.0 | 845.0 | 330.0 | 1.5603 | 78100.0 | INLAND |
| 20636 | -121.21 | 39.49 | 18.0 | 697.0 | 150.0 | 356.0 | 114.0 | 2.5568 | 77100.0 | INLAND |
| 20637 | -121.22 | 39.43 | 17.0 | 2254.0 | 485.0 | 1007.0 | 433.0 | 1.7000 | 92300.0 | INLAND |
| 20638 | -121.32 | 39.43 | 18.0 | 1860.0 | 409.0 | 741.0 | 349.0 | 1.8672 | 84700.0 | INLAND |
| 20639 | -121.24 | 39.37 | 16.0 | 2785.0 | 616.0 | 1387.0 | 530.0 | 2.3886 | 89400.0 | INLAND |
20640 rows × 10 columns
# Column dtypes and non-null counts of the raw data.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 20640 entries, 0 to 20639 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 longitude 20640 non-null float64 1 latitude 20640 non-null float64 2 housing_median_age 20640 non-null float64 3 total_rooms 20640 non-null float64 4 total_bedrooms 20433 non-null float64 5 population 20640 non-null float64 6 households 20640 non-null float64 7 median_income 20640 non-null float64 8 median_house_value 20640 non-null float64 9 ocean_proximity 20640 non-null object dtypes: float64(9), object(1) memory usage: 1.6+ MB
# total_bedrooms is the only column with missing values.
# Every column is float64 except ocean_proximity, which is object.
# Show the rows where total_bedrooms is null.
df[df["total_bedrooms"].isna()]
| longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity | |
|---|---|---|---|---|---|---|---|---|---|---|
| 290 | -122.16 | 37.77 | 47.0 | 1256.0 | NaN | 570.0 | 218.0 | 4.3750 | 161900.0 | NEAR BAY |
| 341 | -122.17 | 37.75 | 38.0 | 992.0 | NaN | 732.0 | 259.0 | 1.6196 | 85100.0 | NEAR BAY |
| 538 | -122.28 | 37.78 | 29.0 | 5154.0 | NaN | 3741.0 | 1273.0 | 2.5762 | 173400.0 | NEAR BAY |
| 563 | -122.24 | 37.75 | 45.0 | 891.0 | NaN | 384.0 | 146.0 | 4.9489 | 247100.0 | NEAR BAY |
| 696 | -122.10 | 37.69 | 41.0 | 746.0 | NaN | 387.0 | 161.0 | 3.9063 | 178400.0 | NEAR BAY |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20267 | -119.19 | 34.20 | 18.0 | 3620.0 | NaN | 3171.0 | 779.0 | 3.3409 | 220500.0 | NEAR OCEAN |
| 20268 | -119.18 | 34.19 | 19.0 | 2393.0 | NaN | 1938.0 | 762.0 | 1.6953 | 167400.0 | NEAR OCEAN |
| 20372 | -118.88 | 34.17 | 15.0 | 4260.0 | NaN | 1701.0 | 669.0 | 5.1033 | 410700.0 | <1H OCEAN |
| 20460 | -118.75 | 34.29 | 17.0 | 5512.0 | NaN | 2734.0 | 814.0 | 6.6073 | 258100.0 | <1H OCEAN |
| 20484 | -118.72 | 34.28 | 17.0 | 3051.0 | NaN | 1705.0 | 495.0 | 5.7376 | 218600.0 | <1H OCEAN |
207 rows × 10 columns
# Check whether there are any fully duplicated rows.
df.loc[df.duplicated()]
| longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity |
|---|
# Summary statistics of the numeric columns.
df.describe()
| longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | |
|---|---|---|---|---|---|---|---|---|---|
| count | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20433.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 |
| mean | -119.569704 | 35.631861 | 28.639486 | 2635.763081 | 537.870553 | 1425.476744 | 499.539680 | 3.870671 | 206855.816909 |
| std | 2.003532 | 2.135952 | 12.585558 | 2181.615252 | 421.385070 | 1132.462122 | 382.329753 | 1.899822 | 115395.615874 |
| min | -124.350000 | 32.540000 | 1.000000 | 2.000000 | 1.000000 | 3.000000 | 1.000000 | 0.499900 | 14999.000000 |
| 25% | -121.800000 | 33.930000 | 18.000000 | 1447.750000 | 296.000000 | 787.000000 | 280.000000 | 2.563400 | 119600.000000 |
| 50% | -118.490000 | 34.260000 | 29.000000 | 2127.000000 | 435.000000 | 1166.000000 | 409.000000 | 3.534800 | 179700.000000 |
| 75% | -118.010000 | 37.710000 | 37.000000 | 3148.000000 | 647.000000 | 1725.000000 | 605.000000 | 4.743250 | 264725.000000 |
| max | -114.310000 | 41.950000 | 52.000000 | 39320.000000 | 6445.000000 | 35682.000000 | 6082.000000 | 15.000100 | 500001.000000 |
# Summary (count, unique, top, freq) of the object-dtype columns.
df.describe(include="O")
| ocean_proximity | |
|---|---|
| count | 20640 |
| unique | 5 |
| top | <1H OCEAN |
| freq | 9136 |
# Distinct labels found in ocean_proximity.
df.ocean_proximity.unique()
array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
dtype=object)
# NEAR BAY: near the bay
# <1H OCEAN: less than one hour away from the ocean
# INLAND: inland
# NEAR OCEAN: near the ocean
# ISLAND: on an island
df.ocean_proximity.value_counts()
<1H OCEAN 9136 INLAND 6551 NEAR OCEAN 2658 NEAR BAY 2290 ISLAND 5 Name: ocean_proximity, dtype: int64
# Histogram of every numeric column; the figure is written to the plots
# folder before it is displayed.
plot_filename = os.path.join(folder_path, "my_plot.png")
df.hist(figsize=(12, 12), bins=50)
plt.savefig(plot_filename)
plt.show()
# Re-check the non-null counts before handling the missing values.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 20640 entries, 0 to 20639 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 longitude 20640 non-null float64 1 latitude 20640 non-null float64 2 housing_median_age 20640 non-null float64 3 total_rooms 20640 non-null float64 4 total_bedrooms 20433 non-null float64 5 population 20640 non-null float64 6 households 20640 non-null float64 7 median_income 20640 non-null float64 8 median_house_value 20640 non-null float64 9 ocean_proximity 20640 non-null object dtypes: float64(9), object(1) memory usage: 1.6+ MB
# Drop the rows that contain missing values (in place), then derive the
# number of rooms per household for every remaining row.
df.dropna(inplace=True)
df["rooms_per_household"] = df["total_rooms"] / df["households"]
df
| longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity | rooms_per_household | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -122.23 | 37.88 | 41.0 | 880.0 | 129.0 | 322.0 | 126.0 | 8.3252 | 452600.0 | NEAR BAY | 6.984127 |
| 1 | -122.22 | 37.86 | 21.0 | 7099.0 | 1106.0 | 2401.0 | 1138.0 | 8.3014 | 358500.0 | NEAR BAY | 6.238137 |
| 2 | -122.24 | 37.85 | 52.0 | 1467.0 | 190.0 | 496.0 | 177.0 | 7.2574 | 352100.0 | NEAR BAY | 8.288136 |
| 3 | -122.25 | 37.85 | 52.0 | 1274.0 | 235.0 | 558.0 | 219.0 | 5.6431 | 341300.0 | NEAR BAY | 5.817352 |
| 4 | -122.25 | 37.85 | 52.0 | 1627.0 | 280.0 | 565.0 | 259.0 | 3.8462 | 342200.0 | NEAR BAY | 6.281853 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20635 | -121.09 | 39.48 | 25.0 | 1665.0 | 374.0 | 845.0 | 330.0 | 1.5603 | 78100.0 | INLAND | 5.045455 |
| 20636 | -121.21 | 39.49 | 18.0 | 697.0 | 150.0 | 356.0 | 114.0 | 2.5568 | 77100.0 | INLAND | 6.114035 |
| 20637 | -121.22 | 39.43 | 17.0 | 2254.0 | 485.0 | 1007.0 | 433.0 | 1.7000 | 92300.0 | INLAND | 5.205543 |
| 20638 | -121.32 | 39.43 | 18.0 | 1860.0 | 409.0 | 741.0 | 349.0 | 1.8672 | 84700.0 | INLAND | 5.329513 |
| 20639 | -121.24 | 39.37 | 16.0 | 2785.0 | 616.0 | 1387.0 | 530.0 | 2.3886 | 89400.0 | INLAND | 5.254717 |
20433 rows × 11 columns
# Ten largest rooms_per_household values, to spot outliers at the top.
df.rooms_per_household.nlargest(10)
1914 141.909091 1979 132.533333 12447 62.422222 1913 61.812500 11862 59.875000 1912 56.269231 9676 52.848214 11707 52.690476 2395 50.837838 1240 47.515152 Name: rooms_per_household, dtype: float64
# Ten smallest rooms_per_household values, to spot outliers at the bottom.
df.rooms_per_household.nsmallest(10)
5916 0.846154 8219 0.888889 3126 1.000000 14818 1.130435 17820 1.130435 4552 1.260870 4550 1.378486 4587 1.411290 4602 1.465753 12484 1.550409 Name: rooms_per_household, dtype: float64
# Inspect the rows holding the two smallest (5916, 8219) and the two largest
# (1914, 1979) rooms_per_household values listed above.
df.loc[[5916,8219,1914,1979]]
| longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity | rooms_per_household | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 5916 | -118.44 | 34.28 | 46.0 | 11.0 | 11.0 | 24.0 | 13.0 | 2.8750 | 162500.0 | <1H OCEAN | 0.846154 |
| 8219 | -118.21 | 33.79 | 33.0 | 32.0 | 18.0 | 96.0 | 36.0 | 4.5938 | 112500.0 | NEAR OCEAN | 0.888889 |
| 1914 | -120.10 | 38.91 | 33.0 | 1561.0 | 282.0 | 30.0 | 11.0 | 1.8750 | 500001.0 | INLAND | 141.909091 |
| 1979 | -120.08 | 38.80 | 34.0 | 1988.0 | 511.0 | 36.0 | 15.0 | 4.6250 | 162500.0 | INLAND | 132.533333 |
# Two more ratio features: people per household and the share of rooms that
# are bedrooms.
df["pop_per_household"] = df["population"] / df["households"]
df["bedrooms_per_room"] = df["total_bedrooms"] / df["total_rooms"]
df
| longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity | rooms_per_household | pop_per_household | bedrooms_per_room | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -122.23 | 37.88 | 41.0 | 880.0 | 129.0 | 322.0 | 126.0 | 8.3252 | 452600.0 | NEAR BAY | 6.984127 | 2.555556 | 0.146591 |
| 1 | -122.22 | 37.86 | 21.0 | 7099.0 | 1106.0 | 2401.0 | 1138.0 | 8.3014 | 358500.0 | NEAR BAY | 6.238137 | 2.109842 | 0.155797 |
| 2 | -122.24 | 37.85 | 52.0 | 1467.0 | 190.0 | 496.0 | 177.0 | 7.2574 | 352100.0 | NEAR BAY | 8.288136 | 2.802260 | 0.129516 |
| 3 | -122.25 | 37.85 | 52.0 | 1274.0 | 235.0 | 558.0 | 219.0 | 5.6431 | 341300.0 | NEAR BAY | 5.817352 | 2.547945 | 0.184458 |
| 4 | -122.25 | 37.85 | 52.0 | 1627.0 | 280.0 | 565.0 | 259.0 | 3.8462 | 342200.0 | NEAR BAY | 6.281853 | 2.181467 | 0.172096 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20635 | -121.09 | 39.48 | 25.0 | 1665.0 | 374.0 | 845.0 | 330.0 | 1.5603 | 78100.0 | INLAND | 5.045455 | 2.560606 | 0.224625 |
| 20636 | -121.21 | 39.49 | 18.0 | 697.0 | 150.0 | 356.0 | 114.0 | 2.5568 | 77100.0 | INLAND | 6.114035 | 3.122807 | 0.215208 |
| 20637 | -121.22 | 39.43 | 17.0 | 2254.0 | 485.0 | 1007.0 | 433.0 | 1.7000 | 92300.0 | INLAND | 5.205543 | 2.325635 | 0.215173 |
| 20638 | -121.32 | 39.43 | 18.0 | 1860.0 | 409.0 | 741.0 | 349.0 | 1.8672 | 84700.0 | INLAND | 5.329513 | 2.123209 | 0.219892 |
| 20639 | -121.24 | 39.37 | 16.0 | 2785.0 | 616.0 | 1387.0 | 530.0 | 2.3886 | 89400.0 | INLAND | 5.254717 | 2.616981 | 0.221185 |
20433 rows × 13 columns
# Summary statistics again, now including the three derived ratio columns.
df.describe()
| longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | rooms_per_household | pop_per_household | bedrooms_per_room | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 20433.000000 | 20433.000000 | 20433.000000 | 20433.000000 | 20433.000000 | 20433.000000 | 20433.000000 | 20433.000000 | 20433.000000 | 20433.000000 | 20433.000000 | 20433.000000 |
| mean | -119.570689 | 35.633221 | 28.633094 | 2636.504233 | 537.870553 | 1424.946949 | 499.433465 | 3.871162 | 206864.413155 | 5.431344 | 3.071533 | 0.213039 |
| std | 2.003578 | 2.136348 | 12.591805 | 2185.269567 | 421.385070 | 1133.208490 | 382.299226 | 1.899291 | 115435.667099 | 2.482946 | 10.438269 | 0.057983 |
| min | -124.350000 | 32.540000 | 1.000000 | 2.000000 | 1.000000 | 3.000000 | 1.000000 | 0.499900 | 14999.000000 | 0.846154 | 0.692308 | 0.100000 |
| 25% | -121.800000 | 33.930000 | 18.000000 | 1450.000000 | 296.000000 | 787.000000 | 280.000000 | 2.563700 | 119500.000000 | 4.441441 | 2.429032 | 0.175427 |
| 50% | -118.490000 | 34.260000 | 29.000000 | 2127.000000 | 435.000000 | 1166.000000 | 409.000000 | 3.536500 | 179700.000000 | 5.230769 | 2.817582 | 0.203162 |
| 75% | -118.010000 | 37.720000 | 37.000000 | 3143.000000 | 647.000000 | 1722.000000 | 604.000000 | 4.744000 | 264700.000000 | 6.052381 | 3.281513 | 0.239821 |
| max | -114.310000 | 41.950000 | 52.000000 | 39320.000000 | 6445.000000 | 35682.000000 | 6082.000000 | 15.000100 | 500001.000000 | 141.909091 | 1243.333333 | 1.000000 |
# Display the frame with the derived columns.
df
| longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity | rooms_per_household | pop_per_household | bedrooms_per_room | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -122.23 | 37.88 | 41.0 | 880.0 | 129.0 | 322.0 | 126.0 | 8.3252 | 452600.0 | NEAR BAY | 6.984127 | 2.555556 | 0.146591 |
| 1 | -122.22 | 37.86 | 21.0 | 7099.0 | 1106.0 | 2401.0 | 1138.0 | 8.3014 | 358500.0 | NEAR BAY | 6.238137 | 2.109842 | 0.155797 |
| 2 | -122.24 | 37.85 | 52.0 | 1467.0 | 190.0 | 496.0 | 177.0 | 7.2574 | 352100.0 | NEAR BAY | 8.288136 | 2.802260 | 0.129516 |
| 3 | -122.25 | 37.85 | 52.0 | 1274.0 | 235.0 | 558.0 | 219.0 | 5.6431 | 341300.0 | NEAR BAY | 5.817352 | 2.547945 | 0.184458 |
| 4 | -122.25 | 37.85 | 52.0 | 1627.0 | 280.0 | 565.0 | 259.0 | 3.8462 | 342200.0 | NEAR BAY | 6.281853 | 2.181467 | 0.172096 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20635 | -121.09 | 39.48 | 25.0 | 1665.0 | 374.0 | 845.0 | 330.0 | 1.5603 | 78100.0 | INLAND | 5.045455 | 2.560606 | 0.224625 |
| 20636 | -121.21 | 39.49 | 18.0 | 697.0 | 150.0 | 356.0 | 114.0 | 2.5568 | 77100.0 | INLAND | 6.114035 | 3.122807 | 0.215208 |
| 20637 | -121.22 | 39.43 | 17.0 | 2254.0 | 485.0 | 1007.0 | 433.0 | 1.7000 | 92300.0 | INLAND | 5.205543 | 2.325635 | 0.215173 |
| 20638 | -121.32 | 39.43 | 18.0 | 1860.0 | 409.0 | 741.0 | 349.0 | 1.8672 | 84700.0 | INLAND | 5.329513 | 2.123209 | 0.219892 |
| 20639 | -121.24 | 39.37 | 16.0 | 2785.0 | 616.0 | 1387.0 | 530.0 | 2.3886 | 89400.0 | INLAND | 5.254717 | 2.616981 | 0.221185 |
20433 rows × 13 columns
# Distribution of the target column, saved to the plots folder before display.
path_filename = os.path.join(folder_path, "hist_median_house_value.png")
df["median_house_value"].hist(figsize=(12, 8), bins=100)
plt.savefig(path_filename)
plt.show()
# Show the pairwise correlation between the numeric columns.
# numeric_only=True leaves out the text column ocean_proximity explicitly;
# relying on the default emits a FutureWarning on this pandas version, and
# the announced new default (False) cannot correlate a text column.
df.corr(numeric_only=True)
C:\Users\fadia\AppData\Local\Temp\ipykernel_38812\4113669569.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. df.corr()
| longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | rooms_per_household | pop_per_household | bedrooms_per_room | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| longitude | 1.000000 | -0.924616 | -0.109357 | 0.045480 | 0.069608 | 0.100270 | 0.056513 | -0.015550 | -0.045398 | -0.027307 | 0.002304 | 0.092657 |
| latitude | -0.924616 | 1.000000 | 0.011899 | -0.036667 | -0.066983 | -0.108997 | -0.071774 | -0.079626 | -0.144638 | 0.106423 | 0.002522 | -0.113815 |
| housing_median_age | -0.109357 | 0.011899 | 1.000000 | -0.360628 | -0.320451 | -0.295787 | -0.302768 | -0.118278 | 0.106432 | -0.153031 | 0.013258 | 0.136089 |
| total_rooms | 0.045480 | -0.036667 | -0.360628 | 1.000000 | 0.930380 | 0.857281 | 0.918992 | 0.197882 | 0.133294 | 0.133482 | -0.024596 | -0.187900 |
| total_bedrooms | 0.069608 | -0.066983 | -0.320451 | 0.930380 | 1.000000 | 0.877747 | 0.979728 | -0.007723 | 0.049686 | 0.001538 | -0.028355 | 0.084238 |
| population | 0.100270 | -0.108997 | -0.295787 | 0.857281 | 0.877747 | 1.000000 | 0.907186 | 0.005087 | -0.025300 | -0.071898 | 0.070062 | 0.035319 |
| households | 0.056513 | -0.071774 | -0.302768 | 0.918992 | 0.979728 | 0.907186 | 1.000000 | 0.013434 | 0.064894 | -0.080165 | -0.027336 | 0.065087 |
| median_income | -0.015550 | -0.079626 | -0.118278 | 0.197882 | -0.007723 | 0.005087 | 0.013434 | 1.000000 | 0.688355 | 0.325307 | 0.018894 | -0.615661 |
| median_house_value | -0.045398 | -0.144638 | 0.106432 | 0.133294 | 0.049686 | -0.025300 | 0.064894 | 0.688355 | 1.000000 | 0.151344 | -0.023639 | -0.255880 |
| rooms_per_household | -0.027307 | 0.106423 | -0.153031 | 0.133482 | 0.001538 | -0.071898 | -0.080165 | 0.325307 | 0.151344 | 1.000000 | -0.004873 | -0.416952 |
| pop_per_household | 0.002304 | 0.002522 | 0.013258 | -0.024596 | -0.028355 | 0.070062 | -0.027336 | 0.018894 | -0.023639 | -0.004873 | 1.000000 | 0.002938 |
| bedrooms_per_room | 0.092657 | -0.113815 | 0.136089 | -0.187900 | 0.084238 | 0.035319 | 0.065087 | -0.615661 | -0.255880 | -0.416952 | 0.002938 | 1.000000 |
# Only the correlation of each column with median_house_value is needed,
# sorted from the most positive correlation down to the most negative one.
df.corr(numeric_only=True)["median_house_value"].sort_values(ascending=False)
median_house_value 1.000000 median_income 0.688355 rooms_per_household 0.151344 total_rooms 0.133294 housing_median_age 0.106432 households 0.064894 total_bedrooms 0.049686 pop_per_household -0.023639 population -0.025300 longitude -0.045398 latitude -0.144638 bedrooms_per_room -0.255880 Name: median_house_value, dtype: float64
# median_income shows the strongest correlation with median_house_value:
# higher income goes together with higher house values.
df["median_income"].hist(figsize=(12, 6), bins=100)
plt.show()
# Regression plot of median_income against median_house_value: a scatter
# plot with a fitted linear regression, plus a histogram for each variable.
sns.set(font_scale=1.5)
sns.jointplot(
    x="median_income",
    y="median_house_value",
    data=df,
    kind="reg",
    height=10,
)
plt.show()
# Same pair of variables drawn with a kernel density estimator (KDE).
sns.set(font_scale=1.5)
sns.jointplot(
    x="median_income",
    y="median_house_value",
    data=df,
    kind="kde",
    height=10,
)
plt.show()
# Geographic scatter plot: marker size follows population / 100 and marker
# colour follows median_house_value.
df.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    s=df["population"] / 100,
    c="median_house_value",
    cmap="coolwarm",
    colorbar=True,
    alpha=0.4,
    label="Population",
    figsize=(15, 10),
    fontsize=15,
    sharex=False,
)
plt.xlabel("Longitude", fontsize=14)
plt.ylabel("Latitude", fontsize=14)
plt.legend(fontsize=16)
plt.show()
import matplotlib.image as mpimg
# Read the California map picture into an array and display the raw values.
california_img = mpimg.imread("california.png")
california_img
array([[[0.627451 , 0.7764706, 0.9137255, 1. ],
[0.627451 , 0.7764706, 0.9137255, 1. ],
[0.627451 , 0.7764706, 0.9137255, 1. ],
...,
[0.9647059, 0.827451 , 0.6666667, 1. ],
[0.9647059, 0.827451 , 0.6666667, 1. ],
[0.9647059, 0.827451 , 0.6666667, 1. ]],
[[0.627451 , 0.7764706, 0.9137255, 1. ],
[0.627451 , 0.7764706, 0.9137255, 1. ],
[0.627451 , 0.7764706, 0.9137255, 1. ],
...,
[0.9647059, 0.827451 , 0.6666667, 1. ],
[0.9647059, 0.827451 , 0.6666667, 1. ],
[0.9647059, 0.827451 , 0.6666667, 1. ]],
[[0.627451 , 0.7764706, 0.9137255, 1. ],
[0.627451 , 0.7764706, 0.9137255, 1. ],
[0.627451 , 0.7764706, 0.9137255, 1. ],
...,
[0.9647059, 0.827451 , 0.6666667, 1. ],
[0.9647059, 0.827451 , 0.6666667, 1. ],
[0.9647059, 0.827451 , 0.6666667, 1. ]],
...,
[[0.627451 , 0.7764706, 0.9137255, 1. ],
[0.627451 , 0.7764706, 0.9137255, 1. ],
[0.627451 , 0.7764706, 0.9137255, 1. ],
...,
[0.9647059, 0.827451 , 0.6666667, 1. ],
[0.9647059, 0.827451 , 0.6666667, 1. ],
[0.9647059, 0.827451 , 0.6666667, 1. ]],
[[0.627451 , 0.7764706, 0.9137255, 1. ],
[0.627451 , 0.7764706, 0.9137255, 1. ],
[0.627451 , 0.7764706, 0.9137255, 1. ],
...,
[0.9647059, 0.827451 , 0.6666667, 1. ],
[0.9647059, 0.827451 , 0.6666667, 1. ],
[0.9647059, 0.827451 , 0.6666667, 1. ]],
[[0.627451 , 0.7764706, 0.9137255, 1. ],
[0.627451 , 0.7764706, 0.9137255, 1. ],
[0.627451 , 0.7764706, 0.9137255, 1. ],
...,
[0.9647059, 0.827451 , 0.6666667, 1. ],
[0.9647059, 0.827451 , 0.6666667, 1. ],
[0.9647059, 0.827451 , 0.6666667, 1. ]]], dtype=float32)
# Draw the map image as loaded (axes are pixel indices).
plt.figure(figsize=(15,10))
plt.imshow(california_img)
plt.show()
# Draw it again with extent=[left, right, bottom, top] so the axes are
# expressed in the longitude/latitude range used by the scatter plots.
plt.figure(figsize=(15,10))
plt.imshow(california_img,extent=[-124.55,-113.80,32.45,42.05])
plt.show()
# Scatter of all districts drawn over the California map image, then saved
# at 300 dpi before being displayed.
df.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    s=df["population"] / 100,
    c="median_house_value",
    cmap="coolwarm",
    colorbar=True,
    alpha=0.4,
    label="Population",
    figsize=(15, 10),
    sharex=False,
)
plt.imshow(
    california_img,
    extent=[-124.55, -113.80, 32.45, 42.05],
    alpha=0.5,
    cmap=plt.get_cmap("jet"),
)
plt.ylabel("Latitude", fontsize=14)
plt.xlabel("longitude", fontsize=14)
plt.legend(fontsize=15)
path_filename = os.path.join(folder_path, "california_plot.png")
plt.savefig(path_filename, dpi=300, bbox_inches="tight")
plt.show()
# Distinct ocean_proximity labels, in the order they first appear in df.
prox= df.ocean_proximity.unique()
prox
array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
dtype=object)
def _plot_proximity_on_map(subset, title, path):
    """Scatter one ocean-proximity subset over the California map and save it.

    Args:
        subset: Rows of ``df`` to draw; the longitude, latitude, population
            and median_house_value columns are read.
        title: Title shown above the figure.
        path: File path the figure is written to (300 dpi, tight bounding box).
    """
    # Marker size follows population / 100; marker colour follows
    # median_house_value on the coolwarm colormap.
    subset.plot(
        kind="scatter",
        x="longitude",
        y="latitude",
        s=subset["population"] / 100,
        c="median_house_value",
        cmap="coolwarm",
        alpha=0.4,
        sharex=False,
        colorbar=True,
        figsize=(15, 10),
        label="Population",
    )
    # extent=[left, right, bottom, top] places the image in longitude/latitude
    # units so it lines up with the scatter drawn on the same axes.
    plt.imshow(
        california_img,
        extent=[-124.55, -113.80, 32.45, 42.05],
        alpha=0.5,
        cmap=plt.get_cmap("jet"),
    )
    plt.ylabel("Latitude", fontsize=14)
    plt.xlabel("longitude", fontsize=14)
    plt.legend(fontsize=15)
    plt.title(title)
    plt.savefig(path, dpi=300, bbox_inches="tight")
    plt.show()


# Select every category by its label rather than by its position in the array
# returned by unique(): that position depends on the row order of df, so
# positional indexing silently picks another category if the rows are reordered.
df_loc_near_bay = df[df.ocean_proximity == "NEAR BAY"].copy()
df_loc_less_one_hour = df[df.ocean_proximity == "<1H OCEAN"].copy()
df_loc_inland = df[df.ocean_proximity == "INLAND"].copy()
df_loc_near_ocean = df[df.ocean_proximity == "NEAR OCEAN"].copy()
df_loc_island = df[df.ocean_proximity == "ISLAND"].copy()

# One map per category; output file names are unchanged, only the misspelled
# figure titles are corrected.
for subset, title, filename in (
    (df_loc_near_bay, "Median House Value by Near Bay",
     "california_plot_near_bay.png"),
    (df_loc_less_one_hour, "Median House Value by <1 Hour",
     "california_plot_less_one_houre.png"),
    (df_loc_inland, "Median House Value by Inland",
     "california_plot_inland.png"),
    (df_loc_near_ocean, "Median House Value by Near Ocean",
     "california_plot_near_ocean.png"),
    (df_loc_island, "Median House Value by Island",
     "california_plot_island.png"),
):
    path_filename = os.path.join(folder_path, filename)
    _plot_proximity_on_map(subset, title, path_filename)
df
| longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity | rooms_per_household | pop_per_household | bedrooms_per_room | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -122.23 | 37.88 | 41.0 | 880.0 | 129.0 | 322.0 | 126.0 | 8.3252 | 452600.0 | NEAR BAY | 6.984127 | 2.555556 | 0.146591 |
| 1 | -122.22 | 37.86 | 21.0 | 7099.0 | 1106.0 | 2401.0 | 1138.0 | 8.3014 | 358500.0 | NEAR BAY | 6.238137 | 2.109842 | 0.155797 |
| 2 | -122.24 | 37.85 | 52.0 | 1467.0 | 190.0 | 496.0 | 177.0 | 7.2574 | 352100.0 | NEAR BAY | 8.288136 | 2.802260 | 0.129516 |
| 3 | -122.25 | 37.85 | 52.0 | 1274.0 | 235.0 | 558.0 | 219.0 | 5.6431 | 341300.0 | NEAR BAY | 5.817352 | 2.547945 | 0.184458 |
| 4 | -122.25 | 37.85 | 52.0 | 1627.0 | 280.0 | 565.0 | 259.0 | 3.8462 | 342200.0 | NEAR BAY | 6.281853 | 2.181467 | 0.172096 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20635 | -121.09 | 39.48 | 25.0 | 1665.0 | 374.0 | 845.0 | 330.0 | 1.5603 | 78100.0 | INLAND | 5.045455 | 2.560606 | 0.224625 |
| 20636 | -121.21 | 39.49 | 18.0 | 697.0 | 150.0 | 356.0 | 114.0 | 2.5568 | 77100.0 | INLAND | 6.114035 | 3.122807 | 0.215208 |
| 20637 | -121.22 | 39.43 | 17.0 | 2254.0 | 485.0 | 1007.0 | 433.0 | 1.7000 | 92300.0 | INLAND | 5.205543 | 2.325635 | 0.215173 |
| 20638 | -121.32 | 39.43 | 18.0 | 1860.0 | 409.0 | 741.0 | 349.0 | 1.8672 | 84700.0 | INLAND | 5.329513 | 2.123209 | 0.219892 |
| 20639 | -121.24 | 39.37 | 16.0 | 2785.0 | 616.0 | 1387.0 | 530.0 | 2.3886 | 89400.0 | INLAND | 5.254717 | 2.616981 | 0.221185 |
20433 rows × 13 columns
# Distribution of median_income, used to choose the income categories below.
df["median_income"].hist(figsize=(12, 6), bins=50)
plt.title("Median Income")
plt.show()
# Turn the numeric median_income column into categorical bins whose edges are
# quantiles (here the quartiles).
pd.qcut(df["median_income"], q=[0, 0.25, 0.50, 0.75, 1])
0 (4.744, 15.0]
1 (4.744, 15.0]
2 (4.744, 15.0]
3 (4.744, 15.0]
4 (3.536, 4.744]
...
20635 (0.499, 2.564]
20636 (0.499, 2.564]
20637 (0.499, 2.564]
20638 (0.499, 2.564]
20639 (0.499, 2.564]
Name: median_income, Length: 20433, dtype: category
Categories (4, interval[float64, right]): [(0.499, 2.564] < (2.564, 3.536] < (3.536, 4.744] < (4.744, 15.0]]
# Five labelled income bands: the three lower quartiles, then 75-95 % as
# "High" and the top 5 % as "Very_High".
df["income_cat"] = pd.qcut(
    df["median_income"],
    q=[0, 0.25, 0.50, 0.75, 0.95, 1],
    labels=["Low", "Below_Average", "Above_Average", "High", "Very_High"],
)
df["income_cat"]
0 Very_High
1 Very_High
2 High
3 High
4 Above_Average
...
20635 Low
20636 Low
20637 Low
20638 Low
20639 Low
Name: income_cat, Length: 20433, dtype: category
Categories (5, object): ['Low' < 'Below_Average' < 'Above_Average' < 'High' < 'Very_High']
# Share of rows that falls into each income category.
df.income_cat.value_counts(normalize=True)
Low 0.250037 Above_Average 0.250037 Below_Average 0.249988 High 0.199922 Very_High 0.050017 Name: income_cat, dtype: float64
# Number of rows per income category, split by ocean proximity.
plt.figure(figsize=(12, 6))
sns.set(font_scale=1.5)
sns.countplot(x="income_cat", hue="ocean_proximity", data=df)
plt.legend(loc="upper right")
plt.show()
# Bar plot of median_house_value for each income category.
plt.figure(figsize=(12, 6))
sns.set(font_scale=1.5)
sns.barplot(x="income_cat", y="median_house_value", data=df, dodge=True)
plt.show()

# Bar plot of median_house_value for each ocean-proximity category.
plt.figure(figsize=(12, 6))
sns.set(font_scale=1.5)
sns.barplot(x="ocean_proximity", y="median_house_value", data=df, dodge=True)
plt.show()
# Mean median_house_value for every income category / ocean proximity pair,
# pivoted so proximity labels become columns; the ISLAND column is dropped.
df.groupby(["income_cat","ocean_proximity"]).median_house_value.mean().unstack().drop(columns=["ISLAND"])
| ocean_proximity | <1H OCEAN | INLAND | NEAR BAY | NEAR OCEAN |
|---|---|---|---|---|
| income_cat | ||||
| Low | 161337.076923 | 84820.626650 | 155122.052133 | 148027.826514 |
| Below_Average | 197236.013829 | 115124.088283 | 220196.177656 | 208665.190096 |
| Above_Average | 232278.358759 | 147846.891351 | 261965.251582 | 255293.813584 |
| High | 292208.766217 | 208095.566622 | 322566.033663 | 337446.227778 |
| Very_High | 439784.235489 | 347571.736842 | 451015.078788 | 468739.723270 |
# Keep the income-category x ocean-proximity table of mean house values
# (without the ISLAND column) and show it truncated to integers.
matrix = (
    df.groupby(["income_cat", "ocean_proximity"])["median_house_value"]
    .mean()
    .unstack()
    .drop(columns=["ISLAND"])
)
matrix.astype("int")
| ocean_proximity | <1H OCEAN | INLAND | NEAR BAY | NEAR OCEAN |
|---|---|---|---|---|
| income_cat | ||||
| Low | 161337 | 84820 | 155122 | 148027 |
| Below_Average | 197236 | 115124 | 220196 | 208665 |
| Above_Average | 232278 | 147846 | 261965 | 255293 |
| High | 292208 | 208095 | 322566 | 337446 |
| Very_High | 439784 | 347571 | 451015 | 468739 |
# Heatmap of the mean house value table, annotated with the integer values.
plt.figure(figsize=(12, 6))
sns.set(font_scale=1.5)
sns.heatmap(
    matrix.astype(int),
    annot=True,
    fmt="d",
    cmap="Reds",
    vmin=90000,
    vmax=470000,
)
plt.show()
# Target column, copied so it is independent of df.
label = df["median_house_value"].copy()
label
0 452600.0
1 358500.0
2 352100.0
3 341300.0
4 342200.0
...
20635 78100.0
20636 77100.0
20637 92300.0
20638 84700.0
20639 89400.0
Name: median_house_value, Length: 20433, dtype: float64
# Everything except the target column forms the feature table.
features = df.drop("median_house_value", axis=1)
features
| longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | ocean_proximity | rooms_per_household | pop_per_household | bedrooms_per_room | income_cat | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -122.23 | 37.88 | 41.0 | 880.0 | 129.0 | 322.0 | 126.0 | 8.3252 | NEAR BAY | 6.984127 | 2.555556 | 0.146591 | Very_High |
| 1 | -122.22 | 37.86 | 21.0 | 7099.0 | 1106.0 | 2401.0 | 1138.0 | 8.3014 | NEAR BAY | 6.238137 | 2.109842 | 0.155797 | Very_High |
| 2 | -122.24 | 37.85 | 52.0 | 1467.0 | 190.0 | 496.0 | 177.0 | 7.2574 | NEAR BAY | 8.288136 | 2.802260 | 0.129516 | High |
| 3 | -122.25 | 37.85 | 52.0 | 1274.0 | 235.0 | 558.0 | 219.0 | 5.6431 | NEAR BAY | 5.817352 | 2.547945 | 0.184458 | High |
| 4 | -122.25 | 37.85 | 52.0 | 1627.0 | 280.0 | 565.0 | 259.0 | 3.8462 | NEAR BAY | 6.281853 | 2.181467 | 0.172096 | Above_Average |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20635 | -121.09 | 39.48 | 25.0 | 1665.0 | 374.0 | 845.0 | 330.0 | 1.5603 | INLAND | 5.045455 | 2.560606 | 0.224625 | Low |
| 20636 | -121.21 | 39.49 | 18.0 | 697.0 | 150.0 | 356.0 | 114.0 | 2.5568 | INLAND | 6.114035 | 3.122807 | 0.215208 | Low |
| 20637 | -121.22 | 39.43 | 17.0 | 2254.0 | 485.0 | 1007.0 | 433.0 | 1.7000 | INLAND | 5.205543 | 2.325635 | 0.215173 | Low |
| 20638 | -121.32 | 39.43 | 18.0 | 1860.0 | 409.0 | 741.0 | 349.0 | 1.8672 | INLAND | 5.329513 | 2.123209 | 0.219892 | Low |
| 20639 | -121.24 | 39.37 | 16.0 | 2785.0 | 616.0 | 1387.0 | 530.0 | 2.3886 | INLAND | 5.254717 | 2.616981 | 0.221185 | Low |
20433 rows × 13 columns
# Dtypes and non-null counts of the feature table.
features.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 20433 entries, 0 to 20639 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 longitude 20433 non-null float64 1 latitude 20433 non-null float64 2 housing_median_age 20433 non-null float64 3 total_rooms 20433 non-null float64 4 total_bedrooms 20433 non-null float64 5 population 20433 non-null float64 6 households 20433 non-null float64 7 median_income 20433 non-null float64 8 ocean_proximity 20433 non-null object 9 rooms_per_household 20433 non-null float64 10 pop_per_household 20433 non-null float64 11 bedrooms_per_room 20433 non-null float64 12 income_cat 20433 non-null category dtypes: category(1), float64(11), object(1) memory usage: 2.6+ MB
# Only the float columns of the feature table.
features.select_dtypes("float")
| longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | rooms_per_household | pop_per_household | bedrooms_per_room | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -122.23 | 37.88 | 41.0 | 880.0 | 129.0 | 322.0 | 126.0 | 8.3252 | 6.984127 | 2.555556 | 0.146591 |
| 1 | -122.22 | 37.86 | 21.0 | 7099.0 | 1106.0 | 2401.0 | 1138.0 | 8.3014 | 6.238137 | 2.109842 | 0.155797 |
| 2 | -122.24 | 37.85 | 52.0 | 1467.0 | 190.0 | 496.0 | 177.0 | 7.2574 | 8.288136 | 2.802260 | 0.129516 |
| 3 | -122.25 | 37.85 | 52.0 | 1274.0 | 235.0 | 558.0 | 219.0 | 5.6431 | 5.817352 | 2.547945 | 0.184458 |
| 4 | -122.25 | 37.85 | 52.0 | 1627.0 | 280.0 | 565.0 | 259.0 | 3.8462 | 6.281853 | 2.181467 | 0.172096 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20635 | -121.09 | 39.48 | 25.0 | 1665.0 | 374.0 | 845.0 | 330.0 | 1.5603 | 5.045455 | 2.560606 | 0.224625 |
| 20636 | -121.21 | 39.49 | 18.0 | 697.0 | 150.0 | 356.0 | 114.0 | 2.5568 | 6.114035 | 3.122807 | 0.215208 |
| 20637 | -121.22 | 39.43 | 17.0 | 2254.0 | 485.0 | 1007.0 | 433.0 | 1.7000 | 5.205543 | 2.325635 | 0.215173 |
| 20638 | -121.32 | 39.43 | 18.0 | 1860.0 | 409.0 | 741.0 | 349.0 | 1.8672 | 5.329513 | 2.123209 | 0.219892 |
| 20639 | -121.24 | 39.37 | 16.0 | 2785.0 | 616.0 | 1387.0 | 530.0 | 2.3886 | 5.254717 | 2.616981 | 0.221185 |
20433 rows × 11 columns
import scipy.stats as stats

# Standardise every float column to z-scores, column by column.
# NOTE(review): the z-scores are computed over all rows before the train/test
# sampling further down, so held-out rows contribute to the mean/std used for
# scaling -- confirm this is acceptable for the evaluation.
feat1 = features.select_dtypes(include="float").apply(stats.zscore)
feat1
| longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | rooms_per_household | pop_per_household | bedrooms_per_room | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -1.327314 | 1.051717 | 0.982163 | -0.803813 | -0.970325 | -0.973320 | -0.976833 | 2.345163 | 0.625395 | -0.049433 | -1.146024 |
| 1 | -1.322323 | 1.042355 | -0.606210 | 2.042130 | 1.348276 | 0.861339 | 1.670373 | 2.332632 | 0.324942 | -0.092134 | -0.987254 |
| 2 | -1.332305 | 1.037674 | 1.855769 | -0.535189 | -0.825561 | -0.819769 | -0.843427 | 1.782939 | 1.150594 | -0.025797 | -1.440514 |
| 3 | -1.337296 | 1.037674 | 1.855769 | -0.623510 | -0.718768 | -0.765056 | -0.733562 | 0.932970 | 0.155467 | -0.050162 | -0.492925 |
| 4 | -1.337296 | 1.037674 | 1.855769 | -0.461970 | -0.611974 | -0.758879 | -0.628930 | -0.013143 | 0.342549 | -0.085272 | -0.706141 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20635 | -0.758318 | 1.800677 | -0.288535 | -0.444580 | -0.388895 | -0.511787 | -0.443207 | -1.216727 | -0.155420 | -0.048949 | 0.199820 |
| 20636 | -0.818212 | 1.805358 | -0.844466 | -0.887557 | -0.920488 | -0.943315 | -1.008223 | -0.692044 | 0.274959 | 0.004912 | 0.037412 |
| 20637 | -0.823203 | 1.777272 | -0.923885 | -0.175042 | -0.125472 | -0.368826 | -0.173778 | -1.143171 | -0.090943 | -0.071460 | 0.036808 |
| 20638 | -0.873115 | 1.777272 | -0.844466 | -0.355344 | -0.305834 | -0.603564 | -0.393506 | -1.055136 | -0.041013 | -0.090853 | 0.118204 |
| 20639 | -0.833186 | 1.749186 | -1.003304 | 0.067955 | 0.185416 | -0.033487 | 0.079956 | -0.780606 | -0.071138 | -0.043548 | 0.140495 |
20433 rows × 11 columns
# Render floats with two decimals in every subsequent pandas display.
pd.set_option("display.float_format", "{:.2f}".format)
# Re-display the standardised features; floats now render with two decimals.
feat1
| longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | rooms_per_household | pop_per_household | bedrooms_per_room | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -1.33 | 1.05 | 0.98 | -0.80 | -0.97 | -0.97 | -0.98 | 2.35 | 0.63 | -0.05 | -1.15 |
| 1 | -1.32 | 1.04 | -0.61 | 2.04 | 1.35 | 0.86 | 1.67 | 2.33 | 0.32 | -0.09 | -0.99 |
| 2 | -1.33 | 1.04 | 1.86 | -0.54 | -0.83 | -0.82 | -0.84 | 1.78 | 1.15 | -0.03 | -1.44 |
| 3 | -1.34 | 1.04 | 1.86 | -0.62 | -0.72 | -0.77 | -0.73 | 0.93 | 0.16 | -0.05 | -0.49 |
| 4 | -1.34 | 1.04 | 1.86 | -0.46 | -0.61 | -0.76 | -0.63 | -0.01 | 0.34 | -0.09 | -0.71 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20635 | -0.76 | 1.80 | -0.29 | -0.44 | -0.39 | -0.51 | -0.44 | -1.22 | -0.16 | -0.05 | 0.20 |
| 20636 | -0.82 | 1.81 | -0.84 | -0.89 | -0.92 | -0.94 | -1.01 | -0.69 | 0.27 | 0.00 | 0.04 |
| 20637 | -0.82 | 1.78 | -0.92 | -0.18 | -0.13 | -0.37 | -0.17 | -1.14 | -0.09 | -0.07 | 0.04 |
| 20638 | -0.87 | 1.78 | -0.84 | -0.36 | -0.31 | -0.60 | -0.39 | -1.06 | -0.04 | -0.09 | 0.12 |
| 20639 | -0.83 | 1.75 | -1.00 | 0.07 | 0.19 | -0.03 | 0.08 | -0.78 | -0.07 | -0.04 | 0.14 |
20433 rows × 11 columns
# Sanity check on the scaling: per-column mean and standard deviation.
feat1.aggregate(["mean", "std"])
| longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | rooms_per_household | pop_per_household | bedrooms_per_room | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| mean | -0.00 | 0.00 | 0.00 | -0.00 | -0.00 | -0.00 | -0.00 | 0.00 | -0.00 | -0.00 | 0.00 |
| std | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 |
# Handling the categorical column
# Inspect the raw string values of the ocean_proximity column.
features["ocean_proximity"]
0 NEAR BAY
1 NEAR BAY
2 NEAR BAY
3 NEAR BAY
4 NEAR BAY
...
20635 INLAND
20636 INLAND
20637 INLAND
20638 INLAND
20639 INLAND
Name: ocean_proximity, Length: 20433, dtype: object
# Number of rows in each ocean_proximity category.
features["ocean_proximity"].value_counts()
<1H OCEAN 9034 INLAND 6496 NEAR OCEAN 2628 NEAR BAY 2270 ISLAND 5 Name: ocean_proximity, dtype: int64
# One-hot encode ocean_proximity: one indicator column per category.
dummies = pd.get_dummies(features["ocean_proximity"])
dummies
| <1H OCEAN | INLAND | ISLAND | NEAR BAY | NEAR OCEAN | |
|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 1 | 0 |
| 1 | 0 | 0 | 0 | 1 | 0 |
| 2 | 0 | 0 | 0 | 1 | 0 |
| 3 | 0 | 0 | 0 | 1 | 0 |
| 4 | 0 | 0 | 0 | 1 | 0 |
| ... | ... | ... | ... | ... | ... |
| 20635 | 0 | 1 | 0 | 0 | 0 |
| 20636 | 0 | 1 | 0 | 0 | 0 |
| 20637 | 0 | 1 | 0 | 0 | 0 |
| 20638 | 0 | 1 | 0 | 0 | 0 |
| 20639 | 0 | 1 | 0 | 0 | 0 |
20433 rows × 5 columns
# Rebuild the feature table side by side (aligned on the row index):
# scaled numeric columns, one-hot proximity columns, and the income category.
features = pd.concat(
    [feat1, dummies, df.income_cat],
    axis="columns",
)
features
| longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | rooms_per_household | pop_per_household | bedrooms_per_room | <1H OCEAN | INLAND | ISLAND | NEAR BAY | NEAR OCEAN | income_cat | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -1.33 | 1.05 | 0.98 | -0.80 | -0.97 | -0.97 | -0.98 | 2.35 | 0.63 | -0.05 | -1.15 | 0 | 0 | 0 | 1 | 0 | Very_High |
| 1 | -1.32 | 1.04 | -0.61 | 2.04 | 1.35 | 0.86 | 1.67 | 2.33 | 0.32 | -0.09 | -0.99 | 0 | 0 | 0 | 1 | 0 | Very_High |
| 2 | -1.33 | 1.04 | 1.86 | -0.54 | -0.83 | -0.82 | -0.84 | 1.78 | 1.15 | -0.03 | -1.44 | 0 | 0 | 0 | 1 | 0 | High |
| 3 | -1.34 | 1.04 | 1.86 | -0.62 | -0.72 | -0.77 | -0.73 | 0.93 | 0.16 | -0.05 | -0.49 | 0 | 0 | 0 | 1 | 0 | High |
| 4 | -1.34 | 1.04 | 1.86 | -0.46 | -0.61 | -0.76 | -0.63 | -0.01 | 0.34 | -0.09 | -0.71 | 0 | 0 | 0 | 1 | 0 | Above_Average |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20635 | -0.76 | 1.80 | -0.29 | -0.44 | -0.39 | -0.51 | -0.44 | -1.22 | -0.16 | -0.05 | 0.20 | 0 | 1 | 0 | 0 | 0 | Low |
| 20636 | -0.82 | 1.81 | -0.84 | -0.89 | -0.92 | -0.94 | -1.01 | -0.69 | 0.27 | 0.00 | 0.04 | 0 | 1 | 0 | 0 | 0 | Low |
| 20637 | -0.82 | 1.78 | -0.92 | -0.18 | -0.13 | -0.37 | -0.17 | -1.14 | -0.09 | -0.07 | 0.04 | 0 | 1 | 0 | 0 | 0 | Low |
| 20638 | -0.87 | 1.78 | -0.84 | -0.36 | -0.31 | -0.60 | -0.39 | -1.06 | -0.04 | -0.09 | 0.12 | 0 | 1 | 0 | 0 | 0 | Low |
| 20639 | -0.83 | 1.75 | -1.00 | 0.07 | 0.19 | -0.03 | 0.08 | -0.78 | -0.07 | -0.04 | 0.14 | 0 | 1 | 0 | 0 | 0 | Low |
20433 rows × 17 columns
# Hold out a random 20% of the rows as the test set; the fixed seed keeps
# the draw reproducible.
test_size = 0.2
x_test = features.sample(random_state=123, frac=test_size)
x_test
| longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | rooms_per_household | pop_per_household | bedrooms_per_room | <1H OCEAN | INLAND | ISLAND | NEAR BAY | NEAR OCEAN | income_cat | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 14354 | 1.17 | -1.35 | -0.45 | -0.10 | 0.27 | -0.13 | 0.25 | -0.39 | -0.56 | -0.09 | 0.99 | 0 | 0 | 0 | 0 | 1 | Below_Average |
| 12908 | -0.86 | 1.40 | -0.37 | 0.11 | -0.03 | -0.12 | -0.03 | -0.29 | 0.18 | -0.04 | -0.51 | 0 | 1 | 0 | 0 | 0 | Below_Average |
| 19545 | -0.70 | 0.93 | 0.82 | -0.13 | 0.16 | 0.55 | 0.28 | -1.14 | -0.62 | 0.03 | 0.76 | 0 | 1 | 0 | 0 | 0 | Low |
| 12188 | 1.11 | -0.91 | -1.40 | -0.64 | -0.72 | -0.74 | -0.81 | -0.40 | 0.42 | -0.00 | -0.36 | 1 | 0 | 0 | 0 | 0 | Below_Average |
| 14786 | 1.22 | -1.43 | -0.61 | -0.41 | -0.58 | -0.39 | -0.53 | 0.50 | 0.16 | 0.02 | -0.75 | 0 | 0 | 0 | 0 | 1 | High |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10262 | 0.85 | -0.82 | -0.69 | 0.61 | 0.01 | 0.41 | 0.11 | 1.43 | 0.77 | 0.04 | -1.33 | 1 | 0 | 0 | 0 | 0 | High |
| 3614 | 0.58 | -0.65 | 0.74 | -0.98 | -1.04 | -0.90 | -1.04 | -0.39 | -0.21 | 0.09 | -0.12 | 1 | 0 | 0 | 0 | 0 | Below_Average |
| 19296 | -1.62 | 1.30 | 0.90 | -0.11 | -0.27 | -0.33 | -0.19 | 0.01 | 0.09 | -0.06 | -0.64 | 1 | 0 | 0 | 0 | 0 | Above_Average |
| 5826 | 0.63 | -0.68 | 1.86 | -0.43 | -0.62 | -0.60 | -0.62 | 0.49 | 0.43 | -0.02 | -0.87 | 1 | 0 | 0 | 0 | 0 | High |
| 15383 | 1.18 | -1.05 | -1.16 | 1.26 | 0.67 | 0.82 | 0.81 | 0.62 | 0.49 | -0.02 | -1.05 | 1 | 0 | 0 | 0 | 0 | High |
4087 rows × 17 columns
# Share of each income category in the test sample.
x_test["income_cat"].value_counts(normalize=True)
Above_Average 0.25 Below_Average 0.25 Low 0.25 High 0.20 Very_High 0.05 Name: income_cat, dtype: float64
# Share of each income category in the full feature table, for comparison
# against the test sample.
features["income_cat"].value_counts(normalize=True)
Low 0.25 Above_Average 0.25 Below_Average 0.25 High 0.20 Very_High 0.05 Name: income_cat, dtype: float64
# Row labels that were drawn into the test sample.
x_test.index
Int64Index([14354, 12908, 19545, 12188, 14786, 9941, 3179, 4650, 15550,
17190,
...
3992, 10261, 10862, 10863, 13864, 10262, 3614, 19296, 5826,
15383],
dtype='int64', length=4087)
# Training rows are all rows whose index label was not drawn into x_test.
x_train = features[~features.index.isin(x_test.index)].copy()
# Share of each income category in the training split.
x_train["income_cat"].value_counts(normalize=True)
Low 0.25 Below_Average 0.25 Above_Average 0.25 High 0.20 Very_High 0.05 Name: income_cat, dtype: float64
# Shuffle the training rows with a fixed seed.
x_train = x_train.sample(frac=1, random_state=123)

# Remove income_cat from both splits so it is not passed to the model.
x_train = x_train.drop(columns=["income_cat"])
x_test = x_test.drop(columns=["income_cat"])

# Look up the targets for each split by row label.
# NOTE(review): `label` is defined in an earlier cell -- presumably the
# median_house_value series; confirm.
y_train = label.loc[x_train.index]
y_test = label.loc[x_test.index]
from sklearn.ensemble import RandomForestRegressor

# Random forest with 500 trees, sqrt-sized feature subsets per split and a
# fixed seed, fitted on the training split.
forest_reg = RandomForestRegressor(
    n_estimators=500,
    max_depth=75,
    max_features="sqrt",
    min_samples_split=2,
    random_state=42,
)
forest_reg.fit(x_train, y_train)
RandomForestRegressor(max_depth=75, max_features='sqrt', n_estimators=500,
random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. RandomForestRegressor(max_depth=75, max_features='sqrt', n_estimators=500,
random_state=42)
forest_reg.score(x_train,y_train)
0.9758470860678036
from sklearn.metrics import mean_squared_error
# Predictions of the fitted forest on the training rows.
pred = forest_reg.predict(x_train)
pred
array([238374.608, 246813.8 , 74191.4 , ..., 184930.402, 117595.4 ,
187186.008])
# Root-mean-squared error between the training targets and `pred`
# (which holds the training-set predictions at this point).
forest_mse = mean_squared_error(y_true=y_train, y_pred=pred)
forest_rmse = np.sqrt(forest_mse)
forest_rmse
18023.671210966968
# Display the fitted estimator and its hyper-parameters.
forest_reg
RandomForestRegressor(max_depth=75, max_features='sqrt', n_estimators=500,
random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. RandomForestRegressor(max_depth=75, max_features='sqrt', n_estimators=500,
random_state=42)
forest_reg.score(x_test,y_test)
0.825152593253362
# Overwrite `pred` with the predictions on the held-out test rows.
pred =forest_reg.predict(x_test)
pred
array([224965.808, 129246.8 , 67903. , ..., 222406.402, 322985.02 ,
268692.8 ])
# Root-mean-squared error between the test targets and `pred`
# (which holds the test-set predictions at this point).
forest_mse = mean_squared_error(y_true=y_test, y_pred=pred)
forest_rmse = np.sqrt(forest_mse)
forest_rmse
47348.34022326726
# Put true test targets and predictions side by side, indexed by y_test's
# row labels.
comp = pd.DataFrame({"True_v": y_test, "Pred": pred})
comp
| True_v | Pred | |
|---|---|---|
| 14354 | 101800.00 | 224965.81 |
| 12908 | 213000.00 | 129246.80 |
| 19545 | 58800.00 | 67903.00 |
| 12188 | 111300.00 | 187250.02 |
| 14786 | 174100.00 | 160228.40 |
| ... | ... | ... |
| 10262 | 272200.00 | 296562.20 |
| 3614 | 175900.00 | 183863.20 |
| 19296 | 215900.00 | 222406.40 |
| 5826 | 326100.00 | 322985.02 |
| 15383 | 291500.00 | 268692.80 |
4087 rows × 2 columns
# Absolute error of each test prediction.
ae = (comp["True_v"] - comp["Pred"]).abs()
ae
14354 123165.81
12908 83753.20
19545 9103.00
12188 75950.02
14786 13871.60
...
10262 24362.20
3614 7963.20
19296 6506.40
5826 3114.98
15383 22807.20
Length: 4087, dtype: float64
# Mean absolute error: the average of the per-row absolute errors in `ae`.
mae = ae.mean()
mae
31722.632696354292
# Raw importance scores from the fitted forest, one value per input column.
forest_reg.feature_importances_
array([8.43251997e-02, 7.63824246e-02, 4.20571593e-02, 2.28712182e-02,
2.01285036e-02, 2.28577613e-02, 1.96337714e-02, 2.80813190e-01,
6.50088158e-02, 9.89248218e-02, 9.71199460e-02, 1.91328297e-02,
1.36700488e-01, 2.65441330e-04, 5.35959721e-03, 8.41883258e-03])
# Label each importance score with its training column name and rank the
# features from most to least important.
feature_imp = pd.Series(
    forest_reg.feature_importances_, index=x_train.columns
).sort_values(ascending=False)
feature_imp
median_income 0.28 INLAND 0.14 pop_per_household 0.10 bedrooms_per_room 0.10 longitude 0.08 latitude 0.08 rooms_per_household 0.07 housing_median_age 0.04 total_rooms 0.02 population 0.02 total_bedrooms 0.02 households 0.02 <1H OCEAN 0.02 NEAR OCEAN 0.01 NEAR BAY 0.01 ISLAND 0.00 dtype: float64
# Horizontal bar chart of the importances; the ascending sort puts the
# largest bar at the top of the chart.
feature_imp.sort_values(ascending=True).plot(kind="barh", figsize=(12, 6))
plt.show()